Libraries
# Packages ----
# Each package is attached once; the original listing repeated the same four
# calls and ran two of them together on a single line, which does not parse.
library(mosaic)
library(tidyverse)
library(plotly)
library(reticulate)

# Food data ----
# Raw data for the GPA chart; the code that follows uses its GPA,
# father_education and mother_education columns.
food <- read_csv("data/food.csv")
# Keep only the three columns the GPA chart needs.
food1 <- food %>%
  select(GPA, father_education, mother_education)

# Build the plotting table:
#   * GPA is coerced to numeric (unparseable entries become NA).
#   * parent_education labels each row from the two parental education codes.
#     Both codes must be one of 1-5; the label follows the higher code
#     (1 = "BHSD", 2 = "1HS", 3 = "1SC", 4 = "1BD", 5 = "1GD"), except that a
#     pair of 5s is labelled "BGD". Every other combination stays NA.
#   * na.omit() then drops every row with an NA in any column.
#   * Finally the labels become a factor ordered from lowest to highest
#     education level so the x axis reads in that order.
food2 <- food1 %>%
  mutate(
    GPA = as.numeric(GPA),
    parent_education = case_when(
      !(mother_education %in% 1:5 & father_education %in% 1:5) ~ NA_character_,
      pmax(mother_education, father_education) == 1 ~ "BHSD",
      pmax(mother_education, father_education) == 2 ~ "1HS",
      pmax(mother_education, father_education) == 3 ~ "1SC",
      pmax(mother_education, father_education) == 4 ~ "1BD",
      # From here on the higher code is 5.
      pmin(mother_education, father_education) == 5 ~ "BGD",
      TRUE ~ "1GD"
    )
  ) %>%
  na.omit() %>%
  mutate(
    parent_education = fct_relevel(
      parent_education,
      "BHSD", "1HS", "1SC", "1BD", "1GD", "BGD"
    )
  )
# Box plot of GPA for each parental-education category, with the individual
# observations jittered on top.
#
# The fill colour is mapped to the category and resolved through a named
# palette instead of being passed as a fixed six-element vector: a fixed
# vector only works when all six categories are present in the data and makes
# ggplot2 error otherwise. The fill legend is hidden because it would only
# repeat the x axis.
ggplot(food2, aes(x = parent_education, y = GPA)) +
  geom_boxplot(aes(fill = parent_education), show.legend = FALSE) +
  geom_jitter(color = "black", size = 1, alpha = 0.9, width = 0.25) +
  scale_fill_manual(
    values = c(
      "BHSD" = "cyan3",
      "1HS" = "deepskyblue",
      "1SC" = "deepskyblue1",
      "1BD" = "deepskyblue2",
      "1GD" = "deepskyblue3",
      "BGD" = "deepskyblue4"
    )
  ) +
  labs(
    title = "How do Parent's Education Levels Effect College Student's GPAs?",
    x = "Parent's Education Levels"
  ) +
  theme_light()
Both High school Dropouts (BHSD)
At least one high school graduate (1HS)
At least one did some college (1SC)
At least one bachelor's degree (1BD)
At least one graduate degree (1GD)
Both graduate degrees (BGD)
To see the full analysis this chart was used in, see: College GPA vs Parental Education
Mimicking a visualization from Our World in Data
My Recreation:
# Read the fruit data and replace its column names, by position, with short
# identifiers.
fruit <- read_csv("data/fruit.csv")
names(fruit) <- c("Entity", "Code", "Year", "Fruit", "GDP", "Continent")

# Drop rows without a Code, then fill each entity's missing Continent values
# from that entity's other rows (downwards first, then upwards).
continent <- fruit %>%
  drop_na(Code) %>%
  group_by(Entity) %>%
  fill(Continent, .direction = "downup") %>%
  ungroup()

library(ggrepel)

# Rows for the single year shown in the chart.
year <- filter(continent, Year == 2020)

# Subset of named countries taken from the 2020 rows.
country5 <- filter(
  year,
  Entity %in% c(
    "Dominica",
    "Dominican Republic",
    "Guyana",
    "Albania",
    "Papua New Guinea",
    "Ghana"
  )
)
# Scatter plot of fruit supply per person against GDP per capita for 2020,
# coloured by continent.
ggplot(year, aes(x = GDP, y = Fruit, color = Continent)) +
  # A gray open circle is drawn under every point as an outline, then the
  # continent-coloured point is drawn on top of it.
  geom_point(size = 2, shape = 1, color = "gray70") +
  geom_point(alpha = 0.85) +
  # Log-scaled x axis with hand-picked dollar breaks.
  scale_x_continuous(
    trans = "log",
    breaks = c(1000, 2000, 5000, 10000, 20000, 50000, 100000),
    labels = c(
      "$1,000", "$2,000", "$5,000", "$10,000",
      "$20,000", "$50,000", "$100,000"
    )
  ) +
  # The axis runs from 0 to 400 with no padding; labels stop at 350 kg.
  scale_y_continuous(
    limits = c(0, 400),
    expand = c(0, 0),
    breaks = seq(0, 350, 50),
    labels = c(
      "0 kg", "50 kg", "100 kg",
      "150 kg", "200 kg", "250 kg",
      "300 kg", "350 kg"
    )
  ) +
  scale_color_manual(
    values = c(
      "Africa" = "#9B559D",
      "Asia" = "#32847E",
      "Europe" = "#536A9D",
      "North America" = "#D96C58",
      "Oceania" = "#925026",
      "South America" = "#802F39"
    )
  ) +
  # Legend keys are drawn as solid, fully opaque squares instead of inheriting
  # the semi-transparent point style.
  guides(
    color = guide_legend(
      override.aes = list(shape = 15, alpha = 1, size = 3),
      keyheight = 0.9,
      keywidth = 0
    )
  ) +
  labs(
    title = "Fruit consumption vs. GDP per capita, 2020",
    subtitle = "Average per capita fruit consumption, measured in kilograms per year versus\ngross domestic product (GDP) per capita, measured in constant international-$",
    x = "GDP per capita",
    y = "Fruit supply per person"
  ) +
  theme_classic() +
  theme(
    panel.grid.major = element_line(linetype = "dotted", color = "gray70"),
    axis.line.y = element_blank(),
    # NOTE(review): element_line(size = ) is deprecated in ggplot2 >= 3.4.0 in
    # favour of linewidth; switch once the minimum ggplot2 version is confirmed.
    axis.line.x = element_line(size = 0.25),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    legend.justification = c(1, 1),
    axis.title.x = element_text(vjust = -1),
    axis.title.y = element_text(vjust = 4)
  )

# Housing data for the interactive rent chart. This statement was previously
# run together with the closing parenthesis of theme() above.
Rent <- read_csv("data/rent.csv")
# Keep the rows with Gender "F" and a Price below 1000, and round the distance
# to two decimals so the hover text stays short.
wRent <- Rent %>%
  filter(Gender == "F" & Price < 1000) %>%
  mutate(
    MilesToCampus = round(MilesToCampus, 2)
  )

# Interactive scatter plot: distance to campus against capacity, with price
# shown through both marker colour and marker size. The trace type and mode
# are stated explicitly so plotly does not have to guess them (and print a
# message) from the supplied data.
plot_ly(
  wRent,
  type = "scatter",
  mode = "markers",
  x = ~MilesToCampus,
  y = ~Capacity,
  color = ~Price,
  colors = c("hotpink", "hotpink4"),
  size = ~Price,
  text = ~paste(Apartment, "\n$", Price)
) %>%
  layout(
    title = "Womens BYU-I Approved Housing\nUnder $1000 per Semester",
    xaxis = list(title = "Miles to the Center of Campus"),
    yaxis = list(title = "Maximum Housing Capacity")
  )
To see full analysis this chart was used in: Housing Analysis for Stephanie
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import Markdown
from IPython.display import display
from tabulate import tabulate

# Yearly name counts; the chart below uses the name, year and Total columns.
names = pd.read_csv("https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv")

# Keep only the rows for the four names being compared.
christian_names = names.query("name == ['Mary', 'Martha', 'Peter', 'Paul']")

# One line per name: year on the x axis, Total on the y axis.
christian_chart = alt.Chart(
    christian_names,
    title=alt.Title(
        "People Born Each Year",
        subtitle="with the names 'Martha', 'Mary', 'Paul', and 'Peter'",
    ),
).encode(
    # format="d" prints the year as a plain integer (no thousands separator).
    x=alt.X('year', title="Year").axis(format="d"),
    y=alt.Y('Total'),
    color='name',
)
christian_chart.mark_line()

# Raw flight-delay records. This statement was previously run together with
# the mark_line() call above.
flights = pd.read_json("https://github.com/byuidatascience/data4missing/raw/master/data-raw/flights_missing/flights_missing.json")
# Strip every non-digit character from the carrier-delay counts, leaving only
# the digits (the values remain strings).
flights['num_of_delays_carrier'] = (
    flights['num_of_delays_carrier'].str.replace(r'\D', '', regex=True)
)

# Turn the placeholder values ("", -999, "n/a") into real NaN and fix the
# misspelled month name.
flights = (flights
    .replace(["", -999, "n/a"], np.nan)
    .replace(["Febuary"], "February")
)

# Fill missing late-aircraft delay counts with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `flights.num_of_delays_late_aircraft` is chained assignment, which does not
# update `flights` when pandas Copy-on-Write is active.
mean_late_air = flights['num_of_delays_late_aircraft'].mean()
flights['num_of_delays_late_aircraft'] = (
    flights['num_of_delays_late_aircraft'].fillna(mean_late_air)
)

# Forward-fill missing months: each gap takes the value of the row before it.
flights['month'] = flights['month'].ffill()
# Per-airport delay summary, worst airport first.
#
# Sums use the string alias "sum" rather than np.sum: passing the NumPy
# callable to groupby.agg is deprecated in recent pandas, and the alias gives
# the same result.
#
# Derived columns (each may use the ones defined before it):
#   total_hrs_delayed  - total delay minutes converted to hours
#   ave_hrs_delayed    - delay hours per delayed flight
#   proportion_delayed - delayed flights as a share of all flights
#   delay_rating       - proportion_delayed * ave_hrs_delayed
totals = (flights
    .groupby("airport_code")
    .agg(
        total_minutes_delayed=("minutes_delayed_total", "sum"),
        total_delays=("num_of_delays_total", "sum"),
        total_flights=("num_of_flights_total", "sum"),
    )
    .assign(
        total_hrs_delayed=lambda df: df.total_minutes_delayed / 60,
        ave_hrs_delayed=lambda df: df.total_hrs_delayed / df.total_delays,
        proportion_delayed=lambda df: df.total_delays / df.total_flights,
        delay_rating=lambda df: df.proportion_delayed * df.ave_hrs_delayed,
    )
    .sort_values('delay_rating', ascending=False)
    .reset_index()
)
# Horizontal bar chart of the delay rating per airport, with bars sorted from
# the highest rating to the lowest. Bars are coloured per airport using the
# "tealblues" scheme and the colour legend is hidden.
best_airport = (
    alt.Chart(
        totals,
        title=alt.Title(
            "Airports Rated by Delay",
            subtitle="The higher the rating, the worse airport",
        ),
    )
    .mark_bar()
    .encode(
        x=alt.X('delay_rating:Q').title("Delay Rating"),
        y=alt.Y('airport_code:N').title("Airport Code").sort("-x"),
        color=alt.Color('airport_code:N').legend(None).scale(scheme="tealblues"),
    )
)
best_airport